1 Setup

assess <- read_csv("anonymisedData/assessments.csv")
course <- read_csv("anonymisedData/courses.csv")
student.assess <- read_csv("anonymisedData/studentAssessment.csv")
student.info <- read_csv("anonymisedData/studentInfo.csv")
student.regio <- read_csv("anonymisedData/studentRegistration.csv")
student.vle <- read_csv("anonymisedData/studentVle.csv")
vle <- read_csv("anonymisedData/vle.csv")
table(assess$code_module)  # 7 modules
## 
## AAA BBB CCC DDD EEE FFF GGG 
##  12  42  20  35  15  52  30
glimpse(assess)
## Observations: 206
## Variables: 6
## $ code_module       <chr> "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA…
## $ code_presentation <chr> "2013J", "2013J", "2013J", "2013J", "2013J", "…
## $ id_assessment     <dbl> 1752, 1753, 1754, 1755, 1756, 1757, 1758, 1759…
## $ assessment_type   <chr> "TMA", "TMA", "TMA", "TMA", "TMA", "Exam", "TM…
## $ date              <dbl> 19, 54, 117, 166, 215, NA, 19, 54, 117, 166, 2…
## $ weight            <dbl> 10, 20, 20, 20, 30, 100, 10, 20, 20, 20, 30, 1…
glimpse(course)
## Observations: 22
## Variables: 3
## $ code_module                <chr> "AAA", "AAA", "BBB", "BBB", "BBB", "B…
## $ code_presentation          <chr> "2013J", "2014J", "2013J", "2014J", "…
## $ module_presentation_length <dbl> 268, 269, 268, 262, 240, 234, 269, 24…
glimpse(student.assess)
## Observations: 173,912
## Variables: 5
## $ id_assessment  <dbl> 1752, 1752, 1752, 1752, 1752, 1752, 1752, 1752, 1…
## $ id_student     <dbl> 11391, 28400, 31604, 32885, 38053, 45462, 45642, …
## $ date_submitted <dbl> 18, 22, 17, 26, 19, 20, 18, 19, 9, 18, 19, 18, 17…
## $ is_banked      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ score          <dbl> 78, 70, 72, 69, 79, 70, 72, 72, 71, 68, 73, 67, 7…
glimpse(student.info)  # students demoraphics + final results for all students
## Observations: 32,593
## Variables: 12
## $ code_module          <chr> "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "…
## $ code_presentation    <chr> "2013J", "2013J", "2013J", "2013J", "2013J"…
## $ id_student           <dbl> 11391, 28400, 30268, 31604, 32885, 38053, 4…
## $ gender               <chr> "M", "F", "F", "F", "F", "M", "M", "F", "F"…
## $ region               <chr> "East Anglian Region", "Scotland", "North W…
## $ highest_education    <chr> "HE Qualification", "HE Qualification", "A …
## $ imd_band             <chr> "90-100%", "20-30%", "30-40%", "50-60%", "5…
## $ age_band             <chr> "55<=", "35-55", "35-55", "35-55", "0-35", …
## $ num_of_prev_attempts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ studied_credits      <dbl> 240, 60, 60, 60, 60, 60, 60, 120, 90, 60, 6…
## $ disability           <chr> "N", "N", "Y", "N", "N", "N", "N", "N", "N"…
## $ final_result         <chr> "Pass", "Pass", "Withdrawn", "Pass", "Pass"…
glimpse(student.regio)  # student registration details
## Observations: 32,593
## Variables: 5
## $ code_module         <chr> "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "A…
## $ code_presentation   <chr> "2013J", "2013J", "2013J", "2013J", "2013J",…
## $ id_student          <dbl> 11391, 28400, 30268, 31604, 32885, 38053, 45…
## $ date_registration   <dbl> -159, -53, -92, -52, -176, -110, -67, -29, -…
## $ date_unregistration <dbl> NA, NA, 12, NA, NA, NA, NA, NA, NA, NA, NA, …
glimpse(student.vle)  # interaction with virtual learning envir
## Observations: 10,655,280
## Variables: 6
## $ code_module       <chr> "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA…
## $ code_presentation <chr> "2013J", "2013J", "2013J", "2013J", "2013J", "…
## $ id_student        <dbl> 28400, 28400, 28400, 28400, 28400, 28400, 2840…
## $ id_site           <dbl> 546652, 546652, 546652, 546614, 546714, 546652…
## $ date              <dbl> -10, -10, -10, -10, -10, -10, -10, -10, -10, -…
## $ sum_click         <dbl> 4, 1, 1, 11, 1, 8, 2, 15, 17, 1, 1, 1, 3, 4, 3…
glimpse(vle)
## Observations: 6,364
## Variables: 6
## $ id_site           <dbl> 546943, 546712, 546998, 546888, 547035, 546614…
## $ code_module       <chr> "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA…
## $ code_presentation <chr> "2013J", "2013J", "2013J", "2013J", "2013J", "…
## $ activity_type     <chr> "resource", "oucontent", "resource", "url", "r…
## $ week_from         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ week_to           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

2 Data in the cohort

2.1 Assessments

There are 7 modules (corses) and for each module there can be 3 assessment types (not all present in every module)

table(assess$assessment_type, assess$code_module)
##       
##        AAA BBB CCC DDD EEE FFF GGG
##   CMA    0  15   8   7   0  28  18
##   Exam   2   4   4   4   3   4   3
##   TMA   10  23   8  24  12  20   9

2.2 Students

n.modules <- student.info %>% group_by(id_student) %>% summarise(len.modules = length(code_module))
  • There is a total of 28785 students enrolled and a student attended a minimum of 1 to a maximum of 5.

  • There are demographics data and assessment results for all students that registered in a module.

sum(!(student.assess$id_student %in% student.info$id_student))
## [1] 0
sum(!(student.info$id_student %in% student.assess$id_student))
## [1] 5847
sum(!(student.info$id_student %in% student.assess$id_student))
## [1] 5847
sum(!(student.info$id_student %in% student.regio$id_student))
## [1] 0
  • In theory, if a student also has an unregistration date they should not have a final score for an exam for a module. Indeed, Students who unregistered have Withdrawal as the value of the final_result column in the studentInfo.csv file. However, this is not always the case and I will clean up students with a different final_result entry.

  • Assessment are submitted throghout the module (different dates) and if a student does not submit an assessment then there is no record.

  • There are 23351 students with at least an assessment available with a valid score.

  • All student with an assessment have demographics available.

  • 5416 students do not have an assessment available.

  • There are 2711 students who did not interact with the VLE

  • There are 25 students with an assessment did not interact with the VLE.

sum(!(student.info$id_student %in% student.vle$id_student))
## [1] 2852
sum(!(student.assess$id_student %in% student.vle$id_student))
## [1] 77
  • A student might have several entries in student.info but not all assignment
  • This means that if I merge the assignment score with student infos, I will have some studnet.infos entries with NAs for the score.
  • Also some students have NAs for their scores in the assessment data

3 Merge datasets

3.1 VLE

vle.combine <- vle %>% left_join(student.vle) %>% dplyr::rename(date.vle = date)

3.2 Assessment infos

# Connect assessment types scores with scores
ass <- student.assess %>% left_join(assess)

summary(ass$score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0    65.0    80.0    75.8    90.0   100.0     173
table(sign(ass$date_submitted), ass$code_module, ass$code_presentation)
## , ,  = 2013B
## 
##     
##        AAA   BBB   CCC   DDD   EEE   FFF   GGG
##   -1     0    40     0    58     0    59     0
##   0      0     0     0     1     0     5     0
##   1      0 11216     0 10314     0 12131     0
## 
## , ,  = 2013J
## 
##     
##        AAA   BBB   CCC   DDD   EEE   FFF   GGG
##   -1     0   270     0   172     0   194     0
##   0      0     3     0     0     0     3     0
##   1   1633 14102     0  7764  2884 16043  5950
## 
## , ,  = 2014B
## 
##     
##        AAA   BBB   CCC   DDD   EEE   FFF   GGG
##   -1     0   194     0    74     1   156     0
##   0      0     7     0     2     0     0     0
##   1      0  9792  7489  4466  1779 10040  4896
## 
## , ,  = 2014J
## 
##     
##        AAA   BBB   CCC   DDD   EEE   FFF   GGG
##   -1    42    37   137   278    39   290    16
##   0      0    30     0     0     0    10     0
##   1   1474  7341 11314  7735  3190 15884  4357
table(sign(ass$date_submitted), ass$assessment_type)
##     
##        CMA  Exam   TMA
##   -1   641     0  1416
##   0     18     0    43
##   1  69868  4959 96967
ass$date.sign <- sign(ass$date_submitted)
ggplot(ass, aes(x = factor(date.sign), y = score, fill = code_module)) + geom_boxplot()

ass <- ass %>% filter(!is.na(score))
  • There are 0 NAs assessment scores and 2055 assignments were submitted before the start of the course. Given the large number and my limited knolwledge about the dataset I will keep them in the study. They might be entry tests. However, I will remove the NAs assessment score.

  • On the online resource it says: date – information about the final submission date of the assessment calculated as the number of days SINCE the start of the module-presentation. The starting date of the presentation has number 0 (zero). How can it be negative??

  • Still after filtering this leaves me with 173739 assessments across 23351 students.

3.3 Students demographics

# Now I know which student what id assessment did.
combine <- student.info %>% # code_module , code_pres, id_student, demo, final score.
  left_join(student.regio) %>% 
    left_join(course) 

4 Create combined features: assignment score and number of clicks

summary(ass$score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    65.0    80.0    75.8    90.0   100.0
ass.total <- ass %>% group_by(code_module, code_presentation, id_student) %>% 
    summarise(comb.score = mean(score))

click.total <- vle.combine %>% group_by(code_module, code_presentation, id_student) %>% 
    summarise(total.clicks = sum(sum_click))

# Merge assessment and clicks
click.score <- ass.total %>% left_join(click.total)

5 Filter students and recode variables

In this first analysis I am interested in understanding what are the factors that influence withdrawal (1 = withdrawl, 0 = PASS/FAIL/Distinction) and Success (1 = PASS/Distinction, 0 = FAIL). The factors influencing withdrawal could be very different from the factors leading to failure, e.g. external factors versus student’s capabilities.

I will create a total weigthed score for the assigment and exams scores available in each module. Some students, if they withdrew do not have score for a module so there will be NAs once combining this meta-score with student infos. I also created a total score for the number of clicks in the VLE done in one module in each semester. I will also create a variable with total number of students in each module.

5.1 Unregistered/Registered students

All students who unregistered during the course could only be Withdrawn as final result since they left the course.
However, it appears that some students that never unregistered still withdrawn from the module. Also, Some students never registered but still they unregistered or they are in course??

combine$Unregistered <- ifelse(is.na(combine$date_unregistration), "in course", 
    "unregistered")
combine$Registered <- ifelse(is.na(combine$date_registration), "not registered", 
    "registered")

sum.reg <- combine %>% group_by(Registered, Unregistered, final_result) %>% 
    summarise(n.students = length(unique(id_student)))

knitr::kable(sum.reg, caption = "Number of students by registration date, unregistration date and final result.")
Number of students by registration date, unregistration date and final result.
Registered Unregistered final_result n.students
not registered in course Fail 4
not registered in course Pass 1
not registered unregistered Withdrawn 38
registered in course Distinction 2903
registered in course Fail 6799
registered in course Pass 11880
registered in course Withdrawn 93
registered unregistered Fail 9
registered unregistered Withdrawn 9043
combine <- combine %>% filter(!(Registered %in% "not registered"), !(Unregistered %in% 
    "unregistered" & Registered %in% "registered" & final_result %in% "Fail"))

We have now 28748 students.

dim(combine)
## [1] 32539    17
combine.feat <- combine %>% left_join(click.score)
dim(combine.feat)
## [1] 32539    19

5.2 Modify some variables: categorical to numeric

nas <- apply(combine.feat, 2, function(x) sum(is.na(x)))
nas
##                code_module          code_presentation 
##                          0                          0 
##                 id_student                     gender 
##                          0                          0 
##                     region          highest_education 
##                          0                          0 
##                   imd_band                   age_band 
##                       1111                          0 
##       num_of_prev_attempts            studied_credits 
##                          0                          0 
##                 disability               final_result 
##                          0                          0 
##          date_registration        date_unregistration 
##                          0                      22515 
## module_presentation_length               Unregistered 
##                          0                          0 
##                 Registered                 comb.score 
##                          0                       6725 
##               total.clicks 
##                       6775
table(combine.feat$highest_education)
## 
##       A Level or Equivalent            HE Qualification 
##                       14022                        4723 
##          Lower Than A Level             No Formal quals 
##                       13135                         346 
## Post Graduate Qualification 
##                         313
table(combine.feat$num_of_prev_attempts)
## 
##     0     1     2     3     4     5     6 
## 28376  3292   674   142    38    13     4
combine.feat <- combine.feat %>% mutate(withdraw = ifelse(final_result %in% 
    "Withdrawn", 1, 0), imd_band_num = as.numeric(as.factor(imd_band)), highest_education = factor(highest_education, 
    levels = c("No Formal quals", "Lower Than A Level", "A Level or Equivalent", 
        "HE Qualification", "Post Graduate Qualification")), num_of_prev_attempts_dich = ifelse(num_of_prev_attempts > 
    0, ">0", "0"), year = str_remove(code_presentation, pattern = "B|J"), semester = str_extract(code_presentation, 
    pattern = "B|J"), code_presentation = factor(code_presentation, levels = c("2013B", 
    "2014B", "2013J", "2014J")))

6 Keep one entry per student

  • There are students who attempted the same module multiple times and students who attempted different modules. The replicates account for roughly 12% of the entries. To avoid more complicated random intercept models, if the same student attempted the same module twice then I will keep the entry with the highest number of num_of_prev_attempts because this information should include information about previous attempts.
  • I will discard randomly all students who attempted different modules. If these aren’t removed, the entries won’t be independent and it would require introducing a mixed effect model with random intercept for each subject. The model itself is a logistic model with many independent variables and fitting a mixed model could be quite complicated and the majority of the students anyway only have one entry.
repl <- names(table(combine.feat$id_student)[table(combine.feat$id_student) > 
    1])
repl.student <- combine.feat[combine.feat$id_student %in% repl, ]

repl.student.sum <- repl.student %>% group_by(id_student) %>% summarise(n.modules = length(unique(code_module)), 
    n.results = length(unique(final_result)), n.repl = length(code_module))

ggplot(repl.student.sum, aes(x = factor(n.modules), fill = factor(n.results))) + 
    geom_bar(position = "dodge") + facet_wrap(~n.repl) + theme_bw()

student.to.keep <- combine.feat %>% group_by(code_module, id_student) %>% summarise(code_presentation = code_presentation[which.max(num_of_prev_attempts)]) %>% 
    unite(key.student.module, code_module, id_student, code_presentation, sep = "-", 
        remove = FALSE)

combine.feat.unique <- combine.feat %>% unite(key.student.module, code_module, 
    id_student, code_presentation, sep = "-", remove = FALSE) %>% filter(key.student.module %in% 
    student.to.keep$key.student.module)

combine.feat.unique <- combine.feat.unique[!duplicated(combine.feat.unique$id_student), 
    ]

nrow(combine.feat.unique)/nrow(combine.feat)  # removed 12% of the initial data
## [1] 0.8834937
# Total number of students attending a course - includign students who
# attempted different modules
nstudents <- combine.feat %>% group_by(code_module, code_presentation) %>% summarise(n.students = length(unique(id_student)))

combine.feat.unique <- combine.feat.unique %>% left_join(nstudents)
sum.reg <- combine.feat.unique %>% group_by(Registered, Unregistered, final_result) %>% 
    summarise(n.students = length(unique(id_student)))
colnames(sum.reg) <- c("Registered", "Unregistered", "Final result", "Number of students")

knitr::kable(sum.reg, caption = "Number of students by registration date, unregistration date and final result.")
Number of students by registration date, unregistration date and final result.
Registered Unregistered Final result Number of students
registered in course Distinction 2684
registered in course Fail 6548
registered in course Pass 11119
registered in course Withdrawn 91
registered unregistered Withdrawn 8306

7 Combine VLE clicks data with students’ results

In 2014 there was an increased activity on the VLe which is probably why it helped improving success.

vle.combine.sum <- vle.combine %>% group_by(code_module, code_presentation, 
    activity_type, id_student) %>% summarise(sum_click = sum(sum_click)) %>% 
    unite(key.student.module, code_module, id_student, code_presentation, sep = "-", 
        remove = FALSE) %>% filter(key.student.module %in% student.to.keep$key.student.module) %>% 
    mutate(year = str_remove(code_presentation, pattern = "B|J"), semester = str_extract(code_presentation, 
        pattern = "B|J")) %>% left_join(nstudents)

nstudents.year.semester <- combine.feat %>% group_by(year, semester) %>% summarise(n.students = length(unique(id_student)))

vle.combine.sum.sum <- vle.combine.sum %>% group_by(activity_type, year, semester) %>% 
    summarise(n.clicks = sum(sum_click)) %>% ungroup() %>% left_join(nstudents.year.semester) %>% 
    mutate(n.clicks.std = n.clicks/n.students) %>% mutate(activity_type = fct_reorder(activity_type, 
    n.clicks.std))

ggplot(vle.combine.sum.sum, aes(x = activity_type, y = n.clicks.std, fill = year)) + 
    geom_bar(stat = "identity", position = "dodge") + facet_wrap(~semester) + 
    coord_flip() + theme_bw()

  • AAA has got very little students
vle.combine.top5 <- vle.combine.sum %>% filter(activity_type %in% c("oucontent", 
    "forumng", "homepage", "quiz", "subpage"))

vle.combine.sum.sum <- vle.combine.top5 %>% group_by(activity_type, semester, 
    code_module, n.students) %>% summarise(n.clicks = sum(sum_click)/unique(n.students)) %>% 
    unite(key, activity_type, semester, code_module, sep = "-", remove = FALSE) %>% 
    ungroup() %>% mutate(activity_type = fct_reorder(activity_type, n.clicks))


ggplot(vle.combine.sum.sum, aes(x = code_module, y = n.clicks, fill = semester)) + 
    geom_bar(stat = "identity", position = "dodge", colour = "white") + facet_wrap(~activity_type) + 
    theme_bw() + coord_flip()

# Sum of clicks done by each student per activity type
click.students.activity <- vle.combine.sum %>% filter(activity_type %in% c("oucontent", 
    "forumng", "homepage", "quiz", "subpage")) %>% group_by(code_module, year, 
    semester, activity_type, id_student) %>% summarise(n.clicks.act = sum(sum_click))

n.students.vle <- click.students.activity %>% group_by(code_module, year, semester) %>% 
    summarise(n.students.clicks = length(unique(id_student)))

click.students.activity <- click.students.activity %>% left_join(n.students.vle) %>% 
    mutate(clicks.per.student = n.clicks.act/n.students.clicks)

8 Describe modules and students attending each module

  • Across two year there were two presentations in each year
  • Module AAA is the one with less student, probably aroun 400. Module CCC in 2014 is the one attracting mode students
  • There seem to be a semester effect in the number students, with more students enrolling in the J part of the year
sum.students <- combine.feat %>% group_by(code_presentation, code_module) %>% 
    summarise(n.students = length(unique(id_student))) %>% ungroup() %>% mutate(year = str_remove(code_presentation, 
    pattern = "B|J"), semester = str_extract(code_presentation, pattern = "B|J"), 
    code_presentation = factor(code_presentation, levels = c("2013B", "2014B", 
        "2013J", "2014J"))) %>% dplyr::rename(`Code presentation` = code_presentation)


# There seem to be a semester effect in the number students, with more
# students enrolling in the J
ggplot(sum.students, aes(x = code_module, y = n.students, fill = `Code presentation`)) + 
    geom_bar(stat = "identity", position = "dodge", colour = "white") + theme_bw() + 
    ggtitle("Number of students enrolled in each module") + theme(legend.position = "bottom") + 
    labs(x = "Code of module", y = "Number of student enrolled")
Number of students enrolled in each module across years and presentations. The figure included students who enrolled multiple time (same or different module).

Number of students enrolled in each module across years and presentations. The figure included students who enrolled multiple time (same or different module).

# There doesn't seem to be a year effect
ggplot(sum.students, aes(x = code_module, y = n.students, fill = year)) + facet_wrap(~semester) + 
    geom_bar(stat = "identity", position = "dodge", colour = "white") + theme_bw() + 
    ggtitle("Number of students enrolled in each module")

  • The score does not look super discriminant. Am I using it correctly?
ggplot(combine.feat.unique, aes(x = comb.score, fill = final_result)) + geom_density() + 
    theme_bw()

9 Exploratory: factors that influence dropout and success

We defined the success rate as the proportion of students that either passed a module with Distinction or Pass out of all the students that registered into a module but excluding students who Withdrawn. Conversely, we defined the dropout rate as the proportion of students that Withdrawn from a module out of all the students that registered into any module. The reasons that push a student to dropout from a module could be very different from the reasons that led to failure.

combine.feat.unique <- combine.feat.unique %>% mutate(pass_fail = ifelse(final_result %in% 
    c("Fail"), 0, 1)) %>% mutate(pass_fail = ifelse(final_result %in% "Withdrawn", 
    NA, pass_fail))

table(combine.feat.unique$pass_fail, combine.feat.unique$final_result)
##    
##     Distinction  Fail  Pass Withdrawn
##   0           0  6548     0         0
##   1        2684     0 11119         0
table(combine.feat.unique$withdraw, combine.feat.unique$final_result)
##    
##     Distinction  Fail  Pass Withdrawn
##   0        2684  6548 11119         0
##   1           0     0     0      8397
# Poeple that withdraw tend to have a slightly lower socio economic statis
ggplot(combine.feat.unique,aes(x = factor(withdraw), y = imd_band_num)) + geom_boxplot() + facet_wrap(~semester) + theme_bw()

# Scotland, wales and ireland are the ones with lower levels of widthdrawal
sum.region.with <- combine.feat.unique %>%
  group_by(region) %>%
  summarise(imd.mean = mean(imd_band_num,na.rm = TRUE),
            sem.imd.mean = sd(imd_band_num,na.rm = TRUE)/sqrt(length(imd_band_num[!is.na(imd_band_num)])),
            
            with.mean = mean(withdraw),
            sem.with.mean = sd(withdraw)/sqrt(length(withdraw))) %>%
  mutate(region = fct_reorder(region,with.mean))

ggplot(sum.region.with,aes(x = with.mean, y = imd.mean,colour=region)) + geom_point(stat = "identity")  + theme_bw() +
  geom_errorbar(aes(ymin = imd.mean - sem.imd.mean, 
                    ymax = imd.mean + sem.imd.mean)) +
  geom_errorbarh(aes( xmin = with.mean - sem.with.mean, 
                    xmax = with.mean + sem.with.mean)) +
  geom_label_repel(aes(label = region)) +
  geom_hline(yintercept = mean(combine.feat.unique$imd_band_num,na.rm = TRUE),linetype="dotted") + ylim(c(3.5,7.5)) +
   geom_vline(xintercept = mean(combine.feat.unique$withdraw,na.rm = TRUE),linetype="dotted") 

  • I will use module BBB as reference for the module, it’s around the mean of withdrawal

  • The module with higheest prop of withdrawal are CCC/DDD and the ones with less withdrawal are AAA and GGG. The socio econoic area does not seem to be too heterogenous apart from module AAA.

sum.region <- combine.feat.unique %>%
  group_by(code_module,semester,year) %>%
  summarise(imd.mean = mean(imd_band_num,na.rm = TRUE),
            sem.imd.mean = sd(imd_band_num,na.rm = TRUE)/sqrt(length(imd_band_num[!is.na(imd_band_num)])),
            
            with.mean = mean(withdraw),
            sem.with.mean = sd(withdraw)/sqrt(length(withdraw))) %>%
  ungroup() %>%
  mutate(code_module = fct_reorder(code_module,with.mean))

ggplot(sum.region,aes(x = with.mean, y = imd.mean,colour=code_module,shape=year)) + geom_point(stat = "identity",size=2)  + theme_bw() +
  geom_errorbar(aes(ymin = imd.mean - sem.imd.mean, 
                    ymax = imd.mean + sem.imd.mean)) +
  geom_errorbarh(aes( xmin = with.mean - sem.with.mean, 
                    xmax = with.mean + sem.with.mean)) +
  geom_label_repel(aes(label = code_module)) + facet_wrap(~semester) +
  geom_hline(yintercept = mean(combine.feat.unique$imd_band_num,na.rm = TRUE),linetype="dotted") + ylim(c(3.5,7.5))+
   geom_vline(xintercept = mean(combine.feat.unique$withdraw,na.rm = TRUE),linetype="dotted") 

sum.region <- combine.feat.unique %>% group_by(code_module, semester, year) %>% 
    summarise(with.mean = mean(withdraw), sem.with.mean = sd(withdraw)/sqrt(length(withdraw))) %>% 
    ungroup() %>% mutate(code_module = fct_reorder(code_module, with.mean))

ggplot(sum.region, aes(x = code_module, y = with.mean, colour = code_module, 
    shape = year)) + geom_point(stat = "identity", size = 2) + theme_bw() + 
    geom_errorbar(aes(ymin = with.mean - sem.with.mean, ymax = with.mean + sem.with.mean), 
        width = 0.5) + facet_wrap(~semester) + geom_hline(yintercept = mean(combine.feat.unique$withdraw, 
    na.rm = TRUE), linetype = "dotted")

9.1 Socio-economical factors

# IMD bands 
imd.band <- unique(combine.feat.unique[,c("imd_band","imd_band_num")]) %>%
  filter(!is.na(imd_band)) %>%
  mutate(imd_band = ifelse(imd_band %in% "10-20","10-20%",imd_band),
         imd.mean = imd_band_num)

# Scotland, wales and ireland are the ones with lower levels of widthdrawal
sum.region.pass.fail <- combine.feat.unique %>%
  group_by(region) %>%
  summarise(imd.mean = round(mean(imd_band_num,na.rm = TRUE)),
            sem.imd.mean = sd(imd_band_num,na.rm = TRUE)/sqrt(length(imd_band_num[!is.na(imd_band_num)])),
            
            with.mean = mean(withdraw,na.rm = TRUE),
            sem.with.mean = sd(withdraw,na.rm = TRUE)/sqrt(length(withdraw[!is.na(withdraw)])),
            
            pass.mean = mean(pass_fail,na.rm = TRUE),
            sem.pass.mean = sd(pass_fail,na.rm = TRUE)/sqrt(length(pass_fail[!is.na(pass_fail)]))) %>%
  mutate(region = fct_reorder(region,pass.mean)) %>%
 left_join(imd.band) %>%
  mutate(Region = paste0(region," (IMD",imd_band,")")) %>%
  dplyr::rename(IMD_band = "imd_band")

plot.region =ggplot(sum.region.pass.fail,aes(x = pass.mean, y = with.mean,colour=IMD_band)) + geom_point()  + theme_classic() +
  geom_vline(xintercept = mean(combine.feat.unique$pass_fail,na.rm = TRUE),linetype="dotted",colour="grey") + 
   geom_hline(yintercept = mean(combine.feat.unique$withdraw,na.rm = TRUE),linetype="dotted",colour="grey") + 
  geom_label_repel(aes(label = Region),show.legend = FALSE,size=2)  + labs(x = "Success rate", y = "Dropout rate") + theme(legend.position = "bottom")
  • I will use module BBB as reference for the module, it’s around the mean of withdrawal

  • The module with higheest prop of withdrawal are CCC/DDD and the ones with less withdrawal are AAA and GGG. The socio econoic area does not seem to be too heterogenous apart from module AAA.

sum.module.with <- combine.feat.unique %>% group_by(code_module) %>% summarise(with.mean = mean(withdraw, 
    na.rm = TRUE), sem.with.mean = sd(withdraw, na.rm = TRUE)/sqrt(length(withdraw[!is.na(withdraw)]))) %>% 
    ungroup() %>% mutate(code_module = fct_reorder(code_module, with.mean))

# Scotland, wales and ireland are the ones with lower levels of widthdrawal
sum.region.pass.fail <- combine.feat.unique %>% group_by(code_module) %>% summarise(pass.mean = mean(pass_fail, 
    na.rm = TRUE), sem.pass.mean = sd(pass_fail, na.rm = TRUE)/sqrt(length(pass_fail[!is.na(pass_fail)])), 
    imd.mean = round(mean(imd_band_num, na.rm = TRUE))) %>% mutate(code_module = fct_reorder(code_module, 
    pass.mean)) %>% left_join(sum.module.with) %>% left_join(imd.band) %>% mutate(Module = paste0(code_module, 
    " (IMD", imd_band, ")")) %>% dplyr::rename(IMD_band = imd_band)


data.overlay <- data.frame(success.rate = c(mean(combine.feat.unique$pass_fail, 
    na.rm = TRUE), 0.2), dropout.rate = c(0.2, mean(combine.feat.unique$withdraw, 
    na.rm = TRUE)), label = c("Average success rate", "Average dropout rate"))

plot.module = ggplot(sum.region.pass.fail, aes(x = pass.mean, y = with.mean, 
    colour = IMD_band)) + geom_point() + theme_classic() + geom_vline(xintercept = mean(combine.feat.unique$pass_fail, 
    na.rm = TRUE), linetype = "dotted", colour = "grey") + geom_hline(yintercept = mean(combine.feat.unique$withdraw, 
    na.rm = TRUE), linetype = "dotted", colour = "grey") + geom_label_repel(aes(label = Module), 
    show.legend = FALSE, size = 2) + labs(x = "Success rate", y = "Dropout rate") + 
    theme(legend.position = "bottom") + xlim(0.55, 0.9) + ylim(0.1, 0.45)
legend <- get_legend(plot.region)
pc1 = plot_grid(plot.region + ggtitle("Success/dropout rate\nby region") + theme(legend.position = "none") + 
    xlim(0.55, 0.9) + ylim(0.1, 0.45), plot.module + ggtitle("Success/dropout rate\nby module") + 
    theme(legend.position = "none") + xlim(0.55, 0.9) + ylim(0.1, 0.45), nrow = 1)

figure1 = plot_grid(pc1)
figure1

ggsave("output/success-dropout-region-module.pdf", width = 12, height = 6)

9.2 Success in modules across years

sum.module.with <- combine.feat.unique %>% group_by(code_module, semester, year) %>% 
    summarise(with.mean = mean(withdraw, na.rm = TRUE), sem.with.mean = sd(withdraw, 
        na.rm = TRUE)/sqrt(length(withdraw[!is.na(withdraw)]))) %>% ungroup() %>% 
    mutate(code_module = fct_reorder(code_module, with.mean))

sum.region.pass.fail <- combine.feat.unique %>% group_by(code_module, semester, 
    year) %>% summarise(pass.mean = mean(pass_fail, na.rm = TRUE), sem.pass.mean = sd(pass_fail, 
    na.rm = TRUE)/sqrt(length(pass_fail[!is.na(pass_fail)]))) %>% ungroup() %>% 
    mutate(code_module = fct_reorder(code_module, pass.mean)) %>% left_join(sum.module.with)


ggplot(sum.region.pass.fail, aes(x = pass.mean, y = with.mean, colour = code_module, 
    shape = year)) + geom_point(stat = "identity", size = 2) + theme_bw() + 
    geom_text_repel(aes(label = code_module)) + facet_wrap(~semester) + geom_hline(yintercept = mean(combine.feat.unique$withdraw, 
    na.rm = TRUE), linetype = "dotted") + geom_vline(xintercept = mean(combine.feat.unique$pass_fail, 
    na.rm = TRUE), linetype = "dotted")

ggsave("output/dropout-success-module-semester-year.png", width = 10, height = 10)

9.3 Education

# Scotland, wales and ireland are the ones with lower levels of widthdrawal
sum.edu.pass.fail <- combine.feat.unique %>%
  group_by(highest_education) %>%
  summarise(with.mean = mean(withdraw,na.rm = TRUE),
            sem.with.mean = sd(withdraw,na.rm = TRUE)/sqrt(length(withdraw[!is.na(withdraw)])),
            
    pass.mean = mean(pass_fail,na.rm = TRUE),
            sem.pass.mean = sd(pass_fail,na.rm = TRUE)/sqrt(length(pass_fail[!is.na(pass_fail)])),
    
            imd.mean = round(mean(imd_band_num,na.rm = TRUE))) %>%
  mutate(highest_education = fct_reorder(highest_education,pass.mean)) %>%
   left_join(imd.band) %>%
  mutate(Highest_education = paste0(highest_education," (IMD",imd_band,")")) %>%
  dplyr::rename(IMD_band = imd_band)


p4 = ggplot(sum.edu.pass.fail,aes(x = pass.mean, y = with.mean,colour=IMD_band)) + geom_point()  + theme_classic() +
  geom_vline(xintercept = mean(combine.feat.unique$pass_fail,na.rm = TRUE),linetype="dotted",colour="grey") + 
   geom_hline(yintercept = mean(combine.feat.unique$withdraw,na.rm = TRUE),linetype="dotted",colour="grey") + 
  geom_text_repel(aes(label = Highest_education),show.legend = FALSE,size=2)  + labs(x = "Success rate", y = "Dropout rate") + theme(legend.position = "bottom") 

9.4 Lineplots final results and demographics

col.props <- data.frame(prop.table(table(combine.feat.unique$final_result, combine.feat.unique$imd_band), 
    margin = 2)) %>% dplyr::rename(`Final result` = Var1) %>% mutate(`Final result` = factor(`Final result`, 
    levels = c("Distinction", "Pass", "Fail", "Withdrawn"))) %>% mutate(Var2 = factor(Var2, 
    levels = c("0-10%", "10-20", "20-30%", "30-40%", "40-50%", "50-60%", "60-70%", 
        "70-80%", "80-90%", "90-100%"), labels = c(c("0-10", "10-20", "20-30", 
        "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"))))

plot.imd = ggplot(col.props, aes(x = Var2, y = Freq * 100, colour = `Final result`, 
    group = `Final result`)) + geom_point() + geom_line() + theme_bw() + labs(x = "Band of Index of Multiple Depravation (%)", 
    y = "% Students") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    theme(axis.text.x = element_text(angle = 5, hjust = 0.5, size = 7), legend.key.size = unit(0.3, 
        "cm"), legend.key.width = unit(0.05, "cm"), axis.title = element_text(size = 9), 
        title = element_text(size = 10), legend.position = "bottom")


col.props <- data.frame(prop.table(table(combine.feat.unique$highest_education, 
    combine.feat.unique$age_band), margin = 2))

p2 = ggplot(col.props, aes(x = Var2, y = Freq * 100, colour = Var1, group = Var1)) + 
    geom_point() + geom_line() + theme_bw() + labs(x = "Age class", y = "% Students") + 
    theme(legend.position = "bottom")

# education
col.props <- data.frame(prop.table(table(combine.feat.unique$final_result, combine.feat.unique$highest_education), 
    margin = 2)) %>% dplyr::rename(`Final result` = Var1) %>% mutate(`Final result` = factor(`Final result`, 
    levels = c("Distinction", "Pass", "Fail", "Withdrawn"))) %>% mutate(Var2 = factor(Var2, 
    levels = levels(Var2), labels = c("No formal qual", "< Than A", "A or equivalent", 
        "HE", "Post Graduate")))

plot.edu = ggplot(col.props, aes(x = Var2, y = Freq * 100, colour = `Final result`, 
    group = `Final result`)) + geom_point() + geom_line() + theme_bw() + labs(x = "Highest education", 
    y = "% Students") + theme(legend.position = "bottom") + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1)) + theme(axis.text.x = element_text(angle = 5, hjust = 0.5, size = 7), 
    legend.key.size = unit(0.3, "cm"), legend.key.width = unit(0.05, "cm"), 
    axis.title = element_text(size = 9), title = element_text(size = 10))
p = plot_grid(plot.imd + theme(legend.position = "none"), plot.edu + theme(legend.position = "none"), 
    nrow = 1)
legend <- get_legend(plot.imd)
plot_grid(p, legend, nrow = 2, rel_heights = c(5, 1))

ggsave("output/imd-education-final-result.pdf", width = 8, height = 3)
col.props <- data.frame(prop.table(table(combine.feat.unique$imd_band, combine.feat.unique$highest_education), 
    margin = 2)) %>% mutate(Var1 = factor(Var1, levels = c("0-10%", "10-20", 
    "20-30%", "30-40%", "40-50%", "50-60%", "60-70%", "70-80%", "80-90%", "90-100%"), 
    labels = c(c("0-10%", "10-20%", "20-30%", "30-40%", "40-50%", "50-60%", 
        "60-70%", "70-80%", "80-90%", "90-100%")))) %>% dplyr::rename(`IMD band` = Var1) %>% 
    mutate(Var2 = factor(Var2, levels = levels(Var2), labels = c("No formal qual", 
        "< Than A", "A or equivalent", "HE", "Post Graduate")))

plot.imd.edu <- ggplot(col.props, aes(x = Var2, y = Freq * 100, colour = `IMD band`, 
    group = `IMD band`)) + geom_point() + geom_line() + theme_bw() + labs(x = "Highest education", 
    y = "% Students") + ggtitle("Qualification by IMD band") + theme(axis.text.x = element_text(angle = 5, 
    hjust = 0.5, size = 7), legend.key.size = unit(0.3, "cm"), legend.key.width = unit(0.05, 
    "cm"), axis.title = element_text(size = 9), title = element_text(size = 10))

9.5 Gender

# Scotland, wales and ireland are the ones with lower levels of widthdrawal
sum.edu.pass.fail <- combine.feat.unique %>%
  group_by(gender) %>%
  summarise(with.mean = mean(withdraw,na.rm = TRUE),
            sem.with.mean = sd(withdraw,na.rm = TRUE)/sqrt(length(withdraw[!is.na(withdraw)])),
            
    pass.mean = mean(pass_fail,na.rm = TRUE),
            sem.pass.mean = sd(pass_fail,na.rm = TRUE)/sqrt(length(pass_fail[!is.na(pass_fail)])),
    
            imd.mean = round(mean(imd_band_num,na.rm = TRUE))) %>%
  mutate(gender = fct_reorder(gender,pass.mean)) %>%
  dplyr::rename(Success = pass.mean,
                Dropout = with.mean) %>%
  dplyr::select(gender,Success,Dropout) %>%
  gather(key = Result, value = Proportion,Success,Dropout)


plot.gender=ggplot(sum.edu.pass.fail,aes(x = gender, y = Proportion,fill=Result)) + geom_bar(stat="identity",position = "dodge")  + theme_classic()+ 
  theme(axis.text.x = element_text(angle = 5, hjust = 0.5,size=7),
         legend.key.size = unit(0.3, "cm"),
  axis.title = element_text(size=9),
  title = element_text(size=10))
plot_grid(plot.imd.edu, plot.gender + ggtitle("Dropout/Success rate by gender"), 
    nrow = 1)

ggsave("output/imd-by-education.pdf", width = 8, height = 3)

9.6 VLE: dropout/success

  • What’s the frequency of final_results among people that clicked and not clicked?
combine.feat.clicks <- combine.feat.unique %>% left_join(click.students.activity) %>% 
    mutate(final_result = factor(final_result, levels = c("Distinction", "Pass", 
        "Fail", "Withdrawn")))

freq.results <- combine.feat.clicks %>% group_by(id_student, final_result) %>% 
    summarise(n.clicks = round(mean(n.clicks.act, na.rm = TRUE))) %>% mutate(VLE = ifelse(is.na(n.clicks), 
    "Never used VLE", "Used VLE")) %>% group_by(VLE, final_result) %>% summarise(n = n())
plot.freq.vle = ggplot(freq.results, aes(x = final_result, y = n, fill = VLE)) + 
    geom_bar(stat = "identity", position = "dodge") + theme_classic() + geom_text(aes(label = round(n)), 
    hjust = 0.5, vjust = -1, size = 2, position = position_dodge(width = 0.9)) + 
    labs(x = "Final result", y = "Number of students") + theme(axis.text.x = element_text(angle = 0, 
    hjust = 0.5, size = 6), axis.text.y = element_text(angle = 0, hjust = 0.5, 
    size = 6), legend.key.size = unit(0.3, "cm"), legend.key.width = unit(0.3, 
    "cm"), legend.text = element_text(size = 5), axis.title = element_text(size = 6), 
    title = element_text(size = 8), legend.position = "top")

plot.freq.vle

ggsave("output/usedVLE-by-result.pdf", width = 3, height = 3)
freq.results <- combine.feat.clicks %>% group_by(id_student, final_result) %>% 
    summarise(n.clicks = mean(n.clicks.act, na.rm = TRUE)) %>% mutate(VLE = ifelse(is.na(n.clicks), 
    "Never used VLE", "Used VLE"))

# Chi square
chisq.test(table(freq.results$final_result, freq.results$VLE))
## 
##  Pearson's Chi-squared test
## 
## data:  table(freq.results$final_result, freq.results$VLE)
## X-squared = 5684.8, df = 3, p-value < 2.2e-16
n.final.result <- combine.feat.unique %>% group_by(final_result) %>% summarise(n = n())

# sum all clicks done for each result and activity type and divide by number
# of students in each result category
result.clicks <- combine.feat.clicks %>% group_by(final_result, activity_type) %>% 
    left_join(n.final.result) %>% summarise(sum.clicks = round(sum(n.clicks.act, 
    na.rm = TRUE))/unique(n)) %>% filter(!is.na(activity_type)) %>% ungroup() %>% 
    mutate(activity_type = fct_reorder(activity_type, sum.clicks)) %>% mutate(final_result = factor(final_result, 
    levels = c("Distinction", "Pass", "Fail", "Withdrawn"))) %>% filter(!(activity_type %in% 
    c("ouwiki", "resource")))

top5 = ggplot(result.clicks, aes(x = final_result, y = sum.clicks, fill = activity_type)) + 
    geom_bar(stat = "identity", position = "dodge", colour = "white") + theme_classic() + 
    theme(legend.position = "bottom") + labs(x = "Final result", y = "Sum of clicks per activity type") + 
    geom_text(aes(label = round(sum.clicks)), hjust = 0.5, vjust = -1, size = 2, 
        position = position_dodge(width = 0.9)) + theme(axis.text.x = element_text(angle = 0, 
    hjust = 0.5, size = 6), axis.text.y = element_text(angle = 0, hjust = 0.5, 
    size = 6), legend.key.size = unit(0.3, "cm"), legend.key.width = unit(0.3, 
    "cm"), axis.title = element_text(size = 6), title = element_text(size = 8), 
    legend.position = "bottom")
ggdraw() + draw_plot(top5 + theme(legend.position = "bottom"), 0, 0, 1, 1) + 
    draw_plot(plot.freq.vle + theme(legend.position = "top"), x = 0.53, y = 0.55, 
        width = 0.5, height = 0.4, scale = 0.9) + draw_plot_label(c("A", "B"), 
    c(0, 0.58), c(1, 0.92), size = 10)

ggsave("output/usedVLE-by-result-top5.pdf", width = 5, height = 4)
  • quiz and outcontent most fruitful
# correlation
combine.feat.unique.num <- combine.feat.clicks %>% spread(key = activity_type, 
    value = n.clicks.act) %>% mutate_at(.vars = c("oucontent", "quiz", "homepage", 
    "subpage"), .funs = function(x) log(x)) %>% dplyr::select(pass_fail, oucontent, 
    quiz, homepage, subpage, disability, forumng, semester, year, gender, code_module) %>% 
    mutate_all(.funs = function(x) as.numeric(as.factor(x)))

corrplot::corrplot(cor(combine.feat.unique.num, use = "pairwise.complete.obs"))

combine.feat.unique.num <- combine.feat.clicks %>% spread(key = activity_type, 
    value = n.clicks.act)

apply(combine.feat.unique.num, 2, function(x) sum(is.na(x)))
##         key.student.module                code_module 
##                          0                          0 
##          code_presentation                 id_student 
##                          0                          0 
##                     gender                     region 
##                          0                          0 
##          highest_education                   imd_band 
##                          0                       4255 
##                   age_band       num_of_prev_attempts 
##                          0                          0 
##            studied_credits                 disability 
##                          0                          0 
##               final_result          date_registration 
##                          0                          0 
##        date_unregistration module_presentation_length 
##                      91644                          0 
##               Unregistered                 Registered 
##                          0                          0 
##                 comb.score               total.clicks 
##                      12094                      12140 
##                   withdraw               imd_band_num 
##                          0                       4255 
##  num_of_prev_attempts_dich                       year 
##                          0                          0 
##                   semester                 n.students 
##                          0                          0 
##                  pass_fail          n.students.clicks 
##                      25237                       2994 
##         clicks.per.student                    forumng 
##                       2994                      93754 
##                   homepage                  oucontent 
##                      90881                      92986 
##                       quiz                    subpage 
##                      98758                      91718 
##                       <NA> 
##                     116606
# remove homepage
m <- glm(pass_fail ~ disability + log(oucontent) + semester + year + gender + 
    code_module, data = combine.feat.unique.num, family = binomial)

car::vif(m)
##                    GVIF Df GVIF^(1/(2*Df))
## disability     1.007914  1        1.003949
## log(oucontent) 2.522731  1        1.588311
## semester       1.084754  1        1.041515
## year           1.182951  1        1.087636
## gender         1.547445  1        1.243963
## code_module    3.985378  6        1.122120
tidym <- tidy(m, conf.int = TRUE)

tidym <- tidym %>% mutate(term = factor(term, levels = term)) %>% filter(term != 
    "(Intercept)")

ggplot(tidym, aes(x = term, y = estimate, colour = -log10(p.value))) + geom_point() + 
    geom_errorbar(aes(ymin = conf.low, ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0, 
    linetype = "dotted") + coord_flip() + ggtitle("Clicks associated with success vs failure")

sum.region <- combine.feat.clicks %>%
  group_by(code_module,semester) %>%
  summarise(clicks.mean = mean(n.clicks.act,na.rm = TRUE),
            sem.clicks.mean = sd(n.clicks.act,na.rm = TRUE)/sqrt(length(n.clicks.act[!is.na(n.clicks.act)])),
            
            with.mean = mean(withdraw),
            sem.with.mean = sd(withdraw)/sqrt(length(withdraw))) %>%
  ungroup() %>%
  mutate(code_module = fct_reorder(code_module,with.mean))

ggplot(sum.region,aes(x = with.mean, y = clicks.mean,colour=code_module)) + geom_point(stat = "identity")  + theme_bw() +
  geom_errorbar(aes(ymin = clicks.mean - sem.clicks.mean, 
                    ymax = clicks.mean + sem.clicks.mean)) +
  geom_errorbarh(aes( xmin = with.mean - sem.with.mean, 
                    xmax = with.mean + sem.with.mean)) +
  geom_label_repel(aes(label = code_module)) + facet_wrap(~semester) + labs(x="Dropout rate",y="Mean of clicks")

10 Logistic on withdral including both B and J together

  • East anglian region as reference
  • BBB as reference module since it is more around the average withdraw/success while AAA is very extreme (high success and lower dropout)
  • I am going to randomly take one observation from students with multiple courses

10.1 Combined semester

# centered on the mean
combine.feat.unique$comb.score.std <- scale(combine.feat.unique$comb.score,center = TRUE,scale = TRUE) 
combine.feat.unique$total.clicks.std <- scale(log(combine.feat.unique$total.clicks),center = TRUE,scale = TRUE)
combine.feat.unique$studied_credits.std <- scale(log(combine.feat.unique$studied_credits),center = TRUE,scale = TRUE)

par(mfrow=c(2,2))
hist(combine.feat.unique$comb.score.std)
hist(combine.feat.unique$total.clicks.std)
hist(combine.feat.unique$studied_credits.std)

# Recode modules
regions <- names(table(combine.feat.unique$region))[!(names(table(combine.feat.unique$region)) %in% "South West Region")]
combine.feat.unique <- combine.feat.unique %>%
  mutate(code_module = factor(code_module, 
                              levels = c("BBB","AAA","CCC","DDD","EEE","FFF","GGG")),
        
          region = factor(region, levels=c("South West Region",regions)))

  • Very bad collinearity
# No imd
m <- glm(withdraw ~ gender + region + highest_education + studied_credits.std + 
    disability + module_presentation_length + code_module + semester + year + 
    total.clicks.std + n.students + num_of_prev_attempts_dich + comb.score.std, 
    data = combine.feat.unique, family = binomial)

car::vif(m)
##                                  GVIF Df GVIF^(1/(2*Df))
## gender                       1.488549  1        1.220061
## region                       1.153646 12        1.005973
## highest_education            1.180044  4        1.020910
## studied_credits.std          1.238350  1        1.112812
## disability                   1.030869  1        1.015317
## module_presentation_length  41.619049  1        6.451283
## code_module                151.757253  6        1.519716
## semester                    47.462224  1        6.889283
## year                         1.229218  1        1.108701
## total.clicks.std             1.487774  1        1.219743
## n.students                  37.680235  1        6.138423
## num_of_prev_attempts_dich    1.092203  1        1.045085
## comb.score.std               1.190036  1        1.090888

A few independent variables show quite high VIF which is symptoms of multicollinearity. This makes sense because the number of students and length of a course are correlated with the module.

ggplot(combine.feat.unique, aes(x = code_module, y = n.students, fill = semester)) + 
    geom_bar(stat = "identity", position = "dodge") + theme_bw()

  • Remove n.students and module_presentation_length
m <- glm(withdraw ~ gender + region + highest_education + studied_credits.std + 
    disability + code_module + semester + year + total.clicks.std + num_of_prev_attempts_dich + 
    comb.score.std + imd_band_num, data = combine.feat.unique, family = binomial)

car::vif(m)  # looks better
##                               GVIF Df GVIF^(1/(2*Df))
## gender                    1.500246  1        1.224845
## region                    1.293690 12        1.010787
## highest_education         1.182264  4        1.021149
## studied_credits.std       1.243820  1        1.115267
## disability                1.035426  1        1.017559
## code_module               3.113794  6        1.099278
## semester                  1.027967  1        1.013887
## year                      1.172279  1        1.082718
## total.clicks.std          1.477427  1        1.215495
## num_of_prev_attempts_dich 1.090450  1        1.044246
## comb.score.std            1.189781  1        1.090771
## imd_band_num              1.176506  1        1.084669
summary(m)
## 
## Call:
## glm(formula = withdraw ~ gender + region + highest_education + 
##     studied_credits.std + disability + code_module + semester + 
##     year + total.clicks.std + num_of_prev_attempts_dich + comb.score.std + 
##     imd_band_num, family = binomial, data = combine.feat.unique)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8468  -0.5496  -0.3740  -0.2312   3.2960  
## 
## Coefficients:
##                                               Estimate Std. Error z value
## (Intercept)                                  -2.880549   0.234868 -12.265
## genderM                                      -0.331087   0.050414  -6.567
## regionEast Anglian Region                    -0.043346   0.096038  -0.451
## regionEast Midlands Region                    0.035555   0.103088   0.345
## regionIreland                                -0.224889   0.139314  -1.614
## regionLondon Region                          -0.182070   0.099028  -1.839
## regionNorth Region                            0.130146   0.131614   0.989
## regionNorth Western Region                   -0.077487   0.099717  -0.777
## regionScotland                               -0.082472   0.096673  -0.853
## regionSouth East Region                       0.060541   0.106342   0.569
## regionSouth Region                            0.040304   0.097552   0.413
## regionWales                                  -0.317189   0.108236  -2.931
## regionWest Midlands Region                    0.023064   0.102013   0.226
## regionYorkshire Region                       -0.044575   0.109551  -0.407
## highest_educationLower Than A Level          -0.108241   0.210938  -0.513
## highest_educationA Level or Equivalent       -0.455266   0.211385  -2.154
## highest_educationHE Qualification            -0.323237   0.217010  -1.490
## highest_educationPost Graduate Qualification -0.943426   0.396998  -2.376
## studied_credits.std                           0.150688   0.023956   6.290
## disabilityY                                   0.316934   0.065284   4.855
## code_moduleAAA                                0.577660   0.152994   3.776
## code_moduleCCC                                1.744103   0.078017  22.355
## code_moduleDDD                                1.143262   0.069550  16.438
## code_moduleEEE                                0.991637   0.119761   8.280
## code_moduleFFF                                1.723407   0.077907  22.121
## code_moduleGGG                               -0.967554   0.125629  -7.702
## semesterJ                                     0.109779   0.042983   2.554
## year2014                                      0.309914   0.046808   6.621
## total.clicks.std                             -1.014747   0.024861 -40.816
## num_of_prev_attempts_dich0                    0.366722   0.062056   5.910
## comb.score.std                               -0.196460   0.019897  -9.874
## imd_band_num                                 -0.021280   0.007914  -2.689
##                                              Pr(>|z|)    
## (Intercept)                                   < 2e-16 ***
## genderM                                      5.12e-11 ***
## regionEast Anglian Region                     0.65175    
## regionEast Midlands Region                    0.73017    
## regionIreland                                 0.10647    
## regionLondon Region                           0.06598 .  
## regionNorth Region                            0.32274    
## regionNorth Western Region                    0.43712    
## regionScotland                                0.39360    
## regionSouth East Region                       0.56915    
## regionSouth Region                            0.67950    
## regionWales                                   0.00338 ** 
## regionWest Midlands Region                    0.82113    
## regionYorkshire Region                        0.68409    
## highest_educationLower Than A Level           0.60785    
## highest_educationA Level or Equivalent        0.03126 *  
## highest_educationHE Qualification             0.13635    
## highest_educationPost Graduate Qualification  0.01748 *  
## studied_credits.std                          3.17e-10 ***
## disabilityY                                  1.21e-06 ***
## code_moduleAAA                                0.00016 ***
## code_moduleCCC                                < 2e-16 ***
## code_moduleDDD                                < 2e-16 ***
## code_moduleEEE                                < 2e-16 ***
## code_moduleFFF                                < 2e-16 ***
## code_moduleGGG                               1.34e-14 ***
## semesterJ                                     0.01065 *  
## year2014                                     3.57e-11 ***
## total.clicks.std                              < 2e-16 ***
## num_of_prev_attempts_dich0                   3.43e-09 ***
## comb.score.std                                < 2e-16 ***
## imd_band_num                                  0.00717 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19524  on 21944  degrees of freedom
## Residual deviance: 15666  on 21913  degrees of freedom
##   (6803 observations deleted due to missingness)
## AIC: 15730
## 
## Number of Fisher Scoring iterations: 5
tidym.combined <- tidy(m, conf.int = TRUE) %>% mutate(term = factor(term, levels = term)) %>% 
    filter(term != "(Intercept)") %>% mutate(OR = exp(estimate))
knitr::kable(tidym.combined)
term estimate std.error statistic p.value conf.low conf.high OR
genderM -0.3310866 0.0504144 -6.5673057 0.0000000 -0.4298784 -0.2322407 0.7181430
regionEast Anglian Region -0.0433456 0.0960380 -0.4513378 0.6517461 -0.2313052 0.1452697 0.9575804
regionEast Midlands Region 0.0355552 0.1030878 0.3449019 0.7301681 -0.1666407 0.2375880 1.0361948
regionIreland -0.2248888 0.1393143 -1.6142541 0.1064723 -0.5013415 0.0451573 0.7986051
regionLondon Region -0.1820697 0.0990277 -1.8385743 0.0659778 -0.3760514 0.0122457 0.8335432
regionNorth Region 0.1301461 0.1316142 0.9888454 0.3227388 -0.1298580 0.3863345 1.1389948
regionNorth Western Region -0.0774865 0.0997171 -0.7770638 0.4371211 -0.2728155 0.1181822 0.9254395
regionScotland -0.0824725 0.0966732 -0.8531059 0.3936006 -0.2716398 0.1074171 0.9208368
regionSouth East Region 0.0605412 0.1063422 0.5693053 0.5691490 -0.1482262 0.2687701 1.0624114
regionSouth Region 0.0403038 0.0975523 0.4131504 0.6794964 -0.1506620 0.2318489 1.0411270
regionWales -0.3171893 0.1082359 -2.9305372 0.0033838 -0.5299166 -0.1054878 0.7281929
regionWest Midlands Region 0.0230638 0.1020129 0.2260874 0.8211334 -0.1769180 0.2230882 1.0233319
regionYorkshire Region -0.0445755 0.1095514 -0.4068913 0.6840879 -0.2598490 0.1697382 0.9564034
highest_educationLower Than A Level -0.1082411 0.2109380 -0.5131417 0.6078522 -0.5120407 0.3165136 0.8974112
highest_educationA Level or Equivalent -0.4552661 0.2113847 -2.1537330 0.0312611 -0.8600151 -0.0297061 0.6342792
highest_educationHE Qualification -0.3232372 0.2170098 -1.4895049 0.1363545 -0.7395213 0.1127737 0.7238022
highest_educationPost Graduate Qualification -0.9434264 0.3969983 -2.3763990 0.0174825 -1.7588437 -0.1919502 0.3892917
studied_credits.std 0.1506879 0.0239562 6.2901360 0.0000000 0.1036840 0.1975991 1.1626338
disabilityY 0.3169341 0.0652836 4.8547259 0.0000012 0.1882020 0.4441604 1.3729121
code_moduleAAA 0.5776604 0.1529942 3.7757024 0.0001596 0.2705053 0.8710313 1.7818648
code_moduleCCC 1.7441031 0.0780168 22.3554932 0.0000000 1.5915883 1.8974398 5.7207684
code_moduleDDD 1.1432620 0.0695498 16.4380339 0.0000000 1.0071560 1.2798200 3.1369845
code_moduleEEE 0.9916368 0.1197612 8.2801179 0.0000000 0.7540120 1.2237643 2.6956431
code_moduleFFF 1.7234065 0.0779074 22.1212181 0.0000000 1.5711438 1.8765674 5.6035848
code_moduleGGG -0.9675535 0.1256292 -7.7016633 0.0000000 -1.2182596 -0.7252782 0.3800116
semesterJ 0.1097790 0.0429835 2.5539832 0.0106498 0.0256775 0.1941863 1.1160314
year2014 0.3099141 0.0468082 6.6209398 0.0000000 0.2183026 0.4018090 1.3633080
total.clicks.std -1.0147472 0.0248615 -40.8160512 0.0000000 -1.0637216 -0.9662574 0.3624940
num_of_prev_attempts_dich0 0.3667222 0.0620562 5.9095147 0.0000000 0.2459014 0.4891943 1.4429970
comb.score.std -0.1964604 0.0198968 -9.8739570 0.0000000 -0.2354804 -0.1574771 0.8216339
imd_band_num -0.0212797 0.0079137 -2.6889900 0.0071669 -0.0368022 -0.0057784 0.9789451
ggplot(tidym.combined, aes(x = term, y = estimate)) + geom_point() + geom_errorbar(aes(ymin = conf.low, 
    ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0, linetype = "dotted") + 
    coord_flip() + ggtitle("Factors associated with withdrawal")

10.1.1 Diagnostics and prediction

aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique, 
    type = "link", se = TRUE))
pairs(aug1[, c("fit", "total.clicks.std", "studied_credits.std")])

aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique, 
    type = "response", se = TRUE))
ggplot(aug1, aes(x = fit, fill = factor(withdraw))) + geom_histogram(alpha = 0.5) + 
    theme_bw()

# Chisquare t-test
HLgof.test(fit = aug1$fit[!is.na(aug1$fit)], obs = aug1$withdraw[!is.na(aug1$fit)])
## $C
## 
##  Hosmer-Lemeshow C statistic
## 
## data:  aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 112.38, df = 8, p-value < 2.2e-16
## 
## 
## $H
## 
##  Hosmer-Lemeshow H statistic
## 
## data:  aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 421.26, df = 8, p-value < 2.2e-16
# Roc curve
aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique, 
    type = "response", se = TRUE))
roc.log <- ggplot(aug1, aes(d = withdraw, m = fit)) + geom_roc() + theme_bw()
roc.log + ggtitle("Withdrawal - combined semesters") + annotate("text", x = 0.75, 
    y = 0.25, label = paste("AUC =", round(calc_auc(roc.log)$AUC, 2))) + geom_abline(sslope = 1, 
    intercept = 0, linetype = "dotted") + scale_x_continuous("1 - Specificity", 
    breaks = seq(0, 1, by = 0.1))

11 Logistic on success including both B and J together

11.1 Combined semester

m <- glm(pass_fail ~ gender + region + highest_education + studied_credits.std + 
    disability + code_module + semester + year + total.clicks.std + num_of_prev_attempts_dich + 
    comb.score.std + imd_band_num, data = combine.feat.unique, family = binomial)

car::vif(m)  # looks better
##                               GVIF Df GVIF^(1/(2*Df))
## gender                    1.651181  1        1.284983
## region                    1.339386 12        1.012250
## highest_education         1.177187  4        1.020600
## studied_credits.std       1.581345  1        1.257515
## disability                1.028493  1        1.014146
## code_module               5.429016  6        1.151401
## semester                  1.058644  1        1.028904
## year                      1.154200  1        1.074337
## total.clicks.std          1.783593  1        1.335512
## num_of_prev_attempts_dich 1.075493  1        1.037060
## comb.score.std            1.132933  1        1.064393
## imd_band_num              1.170450  1        1.081873
summary(m)
## 
## Call:
## glm(formula = pass_fail ~ gender + region + highest_education + 
##     studied_credits.std + disability + code_module + semester + 
##     year + total.clicks.std + num_of_prev_attempts_dich + comb.score.std + 
##     imd_band_num, family = binomial, data = combine.feat.unique)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4413  -0.2840   0.3116   0.5779   3.3900  
## 
## Coefficients:
##                                               Estimate Std. Error z value
## (Intercept)                                   1.179180   0.253092   4.659
## genderM                                       0.011299   0.055401   0.204
## regionEast Anglian Region                    -0.137171   0.103076  -1.331
## regionEast Midlands Region                   -0.095360   0.112182  -0.850
## regionIreland                                -0.182945   0.138227  -1.324
## regionLondon Region                          -0.177716   0.104911  -1.694
## regionNorth Region                           -0.064199   0.147565  -0.435
## regionNorth Western Region                   -0.331789   0.107265  -3.093
## regionScotland                               -0.493826   0.104328  -4.733
## regionSouth East Region                       0.060673   0.118565   0.512
## regionSouth Region                            0.098294   0.108015   0.910
## regionWales                                  -0.405618   0.112010  -3.621
## regionWest Midlands Region                   -0.168068   0.110114  -1.526
## regionYorkshire Region                       -0.222582   0.116761  -1.906
## highest_educationLower Than A Level          -0.201374   0.229537  -0.877
## highest_educationA Level or Equivalent        0.358984   0.229907   1.561
## highest_educationHE Qualification             0.341736   0.236268   1.446
## highest_educationPost Graduate Qualification  0.088385   0.378960   0.233
## studied_credits.std                           0.059441   0.027609   2.153
## disabilityY                                  -0.071637   0.074886  -0.957
## code_moduleAAA                                0.209489   0.164701   1.272
## code_moduleCCC                               -1.298808   0.094268 -13.778
## code_moduleDDD                               -0.905796   0.075381 -12.016
## code_moduleEEE                               -1.579899   0.115367 -13.695
## code_moduleFFF                               -2.526675   0.088552 -28.533
## code_moduleGGG                                0.072901   0.095531   0.763
## semesterJ                                     0.439185   0.045080   9.742
## year2014                                      0.405214   0.046507   8.713
## total.clicks.std                              1.763172   0.035822  49.221
## num_of_prev_attempts_dich0                    0.279031   0.064036   4.357
## comb.score.std                                1.098220   0.030293  36.254
## imd_band_num                                  0.033028   0.008342   3.959
##                                              Pr(>|z|)    
## (Intercept)                                  3.18e-06 ***
## genderM                                      0.838394    
## regionEast Anglian Region                    0.183260    
## regionEast Midlands Region                   0.395296    
## regionIreland                                0.185666    
## regionLondon Region                          0.090269 .  
## regionNorth Region                           0.663520    
## regionNorth Western Region                   0.001980 ** 
## regionScotland                               2.21e-06 ***
## regionSouth East Region                      0.608843    
## regionSouth Region                           0.362819    
## regionWales                                  0.000293 ***
## regionWest Midlands Region                   0.126935    
## regionYorkshire Region                       0.056611 .  
## highest_educationLower Than A Level          0.380321    
## highest_educationA Level or Equivalent       0.118421    
## highest_educationHE Qualification            0.148068    
## highest_educationPost Graduate Qualification 0.815583    
## studied_credits.std                          0.031321 *  
## disabilityY                                  0.338761    
## code_moduleAAA                               0.203397    
## code_moduleCCC                                < 2e-16 ***
## code_moduleDDD                                < 2e-16 ***
## code_moduleEEE                                < 2e-16 ***
## code_moduleFFF                                < 2e-16 ***
## code_moduleGGG                               0.445396    
## semesterJ                                     < 2e-16 ***
## year2014                                      < 2e-16 ***
## total.clicks.std                              < 2e-16 ***
## num_of_prev_attempts_dich0                   1.32e-05 ***
## comb.score.std                                < 2e-16 ***
## imd_band_num                                 7.52e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 21908  on 18364  degrees of freedom
## Residual deviance: 13701  on 18333  degrees of freedom
##   (10383 observations deleted due to missingness)
## AIC: 13765
## 
## Number of Fisher Scoring iterations: 5
tidym.combined <- tidy(m, conf.int = TRUE) %>% mutate(term = factor(term, levels = term)) %>% 
    filter(term != "(Intercept)")
ggplot(tidym.combined, aes(x = term, y = estimate)) + geom_point() + geom_errorbar(aes(ymin = conf.low, 
    ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0, linetype = "dotted") + 
    coord_flip() + ggtitle("Factors associated with withdrawal")

11.2 Diagnostics and prediction

aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique, 
    type = "link", se = TRUE))
pairs(aug1[, c("fit", "total.clicks.std", "studied_credits.std")])

aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique, 
    type = "response", se = TRUE))
ggplot(aug1, aes(x = fit, fill = factor(withdraw))) + geom_histogram(alpha = 0.5) + 
    theme_bw()

# Chisquare t-test
HLgof.test(fit = aug1$fit[!is.na(aug1$fit)], obs = aug1$withdraw[!is.na(aug1$fit)])
## $C
## 
##  Hosmer-Lemeshow C statistic
## 
## data:  aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 276170, df = 8, p-value < 2.2e-16
## 
## 
## $H
## 
##  Hosmer-Lemeshow H statistic
## 
## data:  aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 181090, df = 8, p-value < 2.2e-16
# Roc curve
aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique, 
    type = "response", se = TRUE))
roc.log <- ggplot(aug1, aes(d = withdraw, m = fit)) + geom_roc() + theme_bw()
roc.log + ggtitle("Withdrawal - combined semesters") + annotate("text", x = 0.75, 
    y = 0.25, label = paste("AUC =", round(calc_auc(roc.log)$AUC, 2))) + geom_abline(sslope = 1, 
    intercept = 0, linetype = "dotted") + scale_x_continuous("1 - Specificity", 
    breaks = seq(0, 1, by = 0.1))

12 Withdrawal by semester

12.1 Semester B

  • Remove regions, it was adding coplexity to the model but not really making that much of a difference. Should probably test it.
semB <- subset(combine.feat.unique, semester %in% "B")
m <- glm(withdraw ~ gender + region + highest_education + studied_credits.std + 
    disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich + 
    comb.score.std + imd_band_num, data = semB, family = binomial)

car::vif(m)  # oooks better
##                               GVIF Df GVIF^(1/(2*Df))
## gender                    1.549976  1        1.244980
## region                    1.300428 12        1.011006
## highest_education         1.175302  4        1.020396
## studied_credits.std       1.438013  1        1.199172
## disability                1.035898  1        1.017791
## code_module               3.960373  5        1.147555
## year                      1.293596  1        1.137364
## total.clicks.std          1.618914  1        1.272365
## num_of_prev_attempts_dich 1.284724  1        1.133456
## comb.score.std            1.406154  1        1.185813
## imd_band_num              1.194279  1        1.092831
summary(m)
## 
## Call:
## glm(formula = withdraw ~ gender + region + highest_education + 
##     studied_credits.std + disability + code_module + year + total.clicks.std + 
##     num_of_prev_attempts_dich + comb.score.std + imd_band_num, 
##     family = binomial, data = semB)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6309  -0.5559  -0.3925  -0.2479   2.9628  
## 
## Coefficients:
##                                                Estimate Std. Error z value
## (Intercept)                                  -2.3198772  0.3298462  -7.033
## genderM                                      -0.3448773  0.0831752  -4.146
## regionEast Anglian Region                    -0.0913176  0.1513791  -0.603
## regionEast Midlands Region                    0.1375766  0.1608466   0.855
## regionIreland                                -0.0504522  0.2382082  -0.212
## regionLondon Region                          -0.2162457  0.1550372  -1.395
## regionNorth Region                           -0.2380043  0.2286610  -1.041
## regionNorth Western Region                   -0.0981280  0.1585165  -0.619
## regionScotland                               -0.1374326  0.1593822  -0.862
## regionSouth East Region                      -0.0008147  0.1714789  -0.005
## regionSouth Region                           -0.1665937  0.1576403  -1.057
## regionWales                                  -0.4714802  0.1775146  -2.656
## regionWest Midlands Region                   -0.0681321  0.1617427  -0.421
## regionYorkshire Region                       -0.0396765  0.1730472  -0.229
## highest_educationLower Than A Level          -0.4382973  0.2901558  -1.511
## highest_educationA Level or Equivalent       -0.8023334  0.2911118  -2.756
## highest_educationHE Qualification            -0.6809019  0.3022786  -2.253
## highest_educationPost Graduate Qualification -2.7591176  1.0648012  -2.591
## studied_credits.std                           0.0178575  0.0423624   0.422
## disabilityY                                   0.3045867  0.1056286   2.884
## code_moduleCCC                                1.5328302  0.1369064  11.196
## code_moduleDDD                                1.2213514  0.1148741  10.632
## code_moduleEEE                                0.9265971  0.2674343   3.465
## code_moduleFFF                                1.5918358  0.1274617  12.489
## code_moduleGGG                               -1.0484223  0.2091110  -5.014
## year2014                                      0.0053164  0.0785677   0.068
## total.clicks.std                             -0.9312455  0.0422302 -22.052
## num_of_prev_attempts_dich0                    0.3862757  0.1064487   3.629
## comb.score.std                               -0.2547609  0.0349667  -7.286
## imd_band_num                                  0.0017447  0.0129230   0.135
##                                              Pr(>|z|)    
## (Intercept)                                  2.02e-12 ***
## genderM                                      3.38e-05 ***
## regionEast Anglian Region                    0.546350    
## regionEast Midlands Region                   0.392370    
## regionIreland                                0.832264    
## regionLondon Region                          0.163077    
## regionNorth Region                           0.297940    
## regionNorth Western Region                   0.535890    
## regionScotland                               0.388532    
## regionSouth East Region                      0.996209    
## regionSouth Region                           0.290605    
## regionWales                                  0.007907 ** 
## regionWest Midlands Region                   0.673582    
## regionYorkshire Region                       0.818650    
## highest_educationLower Than A Level          0.130901    
## highest_educationA Level or Equivalent       0.005849 ** 
## highest_educationHE Qualification            0.024287 *  
## highest_educationPost Graduate Qualification 0.009564 ** 
## studied_credits.std                          0.673359    
## disabilityY                                  0.003932 ** 
## code_moduleCCC                                < 2e-16 ***
## code_moduleDDD                                < 2e-16 ***
## code_moduleEEE                               0.000531 ***
## code_moduleFFF                                < 2e-16 ***
## code_moduleGGG                               5.34e-07 ***
## year2014                                     0.946051    
## total.clicks.std                              < 2e-16 ***
## num_of_prev_attempts_dich0                   0.000285 ***
## comb.score.std                               3.20e-13 ***
## imd_band_num                                 0.892604    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7194.3  on 8162  degrees of freedom
## Residual deviance: 5923.1  on 8133  degrees of freedom
##   (2651 observations deleted due to missingness)
## AIC: 5983.1
## 
## Number of Fisher Scoring iterations: 6
tidyB <- tidy(m, conf.int = TRUE) %>% mutate(term = factor(term, levels = term)) %>% 
    filter(term != "(Intercept)")

ggplot(tidyB, aes(x = term, y = estimate)) + geom_point() + geom_errorbar(aes(ymin = conf.low, 
    ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0, linetype = "dotted") + 
    coord_flip() + ggtitle("Factors associated with withdrawal")

aug1 <- cbind(semB, predict(m, newdata = semB, type = "response", se = TRUE))

ggplot(aug1, aes(x = fit, fill = factor(withdraw))) + geom_histogram(alpha = 0.5) + 
    theme_bw()

HLgof.test(fit = aug1$fit[!is.na(aug1$fit)], obs = aug1$withdraw[!is.na(aug1$fit)])
## $C
## 
##  Hosmer-Lemeshow C statistic
## 
## data:  aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 52.208, df = 8, p-value = 1.534e-08
## 
## 
## $H
## 
##  Hosmer-Lemeshow H statistic
## 
## data:  aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 165.94, df = 8, p-value < 2.2e-16

12.2 Semester J

semJ <- subset(combine.feat.unique, semester %in% "J")
m <- glm(withdraw ~ gender + region + highest_education + studied_credits.std + 
    disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich + 
    comb.score.std + imd_band_num, data = semJ, family = binomial)

car::vif(m)  # oooks better
##                               GVIF Df GVIF^(1/(2*Df))
## gender                    1.473383  1        1.213830
## region                    1.313810 12        1.011437
## highest_education         1.189169  4        1.021893
## studied_credits.std       1.232645  1        1.110245
## disability                1.038408  1        1.019023
## code_module               2.888944  6        1.092433
## year                      1.163116  1        1.078479
## total.clicks.std          1.442007  1        1.200836
## num_of_prev_attempts_dich 1.067219  1        1.033063
## comb.score.std            1.147195  1        1.071072
## imd_band_num              1.165175  1        1.079433
summary(m)
## 
## Call:
## glm(formula = withdraw ~ gender + region + highest_education + 
##     studied_credits.std + disability + code_module + year + total.clicks.std + 
##     num_of_prev_attempts_dich + comb.score.std + imd_band_num, 
##     family = binomial, data = semJ)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0559  -0.5406  -0.3575  -0.2178   3.3807  
## 
## Coefficients:
##                                               Estimate Std. Error z value
## (Intercept)                                  -3.220820   0.340757  -9.452
## genderM                                      -0.311627   0.063823  -4.883
## regionEast Anglian Region                     0.008632   0.124906   0.069
## regionEast Midlands Region                   -0.000650   0.134808  -0.005
## regionIreland                                -0.258924   0.173611  -1.491
## regionLondon Region                          -0.172297   0.129660  -1.329
## regionNorth Region                            0.337058   0.163682   2.059
## regionNorth Western Region                   -0.042584   0.129008  -0.330
## regionScotland                               -0.012856   0.122959  -0.105
## regionSouth East Region                       0.109048   0.136554   0.799
## regionSouth Region                            0.198870   0.125264   1.588
## regionWales                                  -0.206181   0.137723  -1.497
## regionWest Midlands Region                    0.106182   0.132399   0.802
## regionYorkshire Region                       -0.017462   0.142660  -0.122
## highest_educationLower Than A Level           0.220210   0.311740   0.706
## highest_educationA Level or Equivalent       -0.117983   0.312117  -0.378
## highest_educationHE Qualification             0.014535   0.318247   0.046
## highest_educationPost Graduate Qualification -0.143236   0.486977  -0.294
## studied_credits.std                           0.229344   0.030399   7.545
## disabilityY                                   0.326086   0.083523   3.904
## code_moduleAAA                                0.608535   0.160047   3.802
## code_moduleCCC                                1.821713   0.097862  18.615
## code_moduleDDD                                1.025501   0.089300  11.484
## code_moduleEEE                                1.045374   0.138493   7.548
## code_moduleFFF                                1.757835   0.099957  17.586
## code_moduleGGG                               -0.841453   0.160022  -5.258
## year2014                                      0.508166   0.060214   8.439
## total.clicks.std                             -1.075798   0.031327 -34.341
## num_of_prev_attempts_dich0                    0.313237   0.079850   3.923
## comb.score.std                               -0.171248   0.024985  -6.854
## imd_band_num                                 -0.034820   0.010077  -3.455
##                                              Pr(>|z|)    
## (Intercept)                                   < 2e-16 ***
## genderM                                      1.05e-06 ***
## regionEast Anglian Region                    0.944907    
## regionEast Midlands Region                   0.996153    
## regionIreland                                0.135857    
## regionLondon Region                          0.183900    
## regionNorth Region                           0.039473 *  
## regionNorth Western Region                   0.741332    
## regionScotland                               0.916729    
## regionSouth East Region                      0.424538    
## regionSouth Region                           0.112375    
## regionWales                                  0.134374    
## regionWest Midlands Region                   0.422558    
## regionYorkshire Region                       0.902578    
## highest_educationLower Than A Level          0.479946    
## highest_educationA Level or Equivalent       0.705424    
## highest_educationHE Qualification            0.963573    
## highest_educationPost Graduate Qualification 0.768657    
## studied_credits.std                          4.54e-14 ***
## disabilityY                                  9.46e-05 ***
## code_moduleAAA                               0.000143 ***
## code_moduleCCC                                < 2e-16 ***
## code_moduleDDD                                < 2e-16 ***
## code_moduleEEE                               4.41e-14 ***
## code_moduleFFF                                < 2e-16 ***
## code_moduleGGG                               1.45e-07 ***
## year2014                                      < 2e-16 ***
## total.clicks.std                              < 2e-16 ***
## num_of_prev_attempts_dich0                   8.75e-05 ***
## comb.score.std                               7.18e-12 ***
## imd_band_num                                 0.000550 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12328.7  on 13781  degrees of freedom
## Residual deviance:  9638.4  on 13751  degrees of freedom
##   (4152 observations deleted due to missingness)
## AIC: 9700.4
## 
## Number of Fisher Scoring iterations: 6
tidyJ <- tidy(m, conf.int = TRUE) %>% mutate(term = factor(term, levels = term)) %>% 
    filter(term != "(Intercept)")

ggplot(tidyJ, aes(x = term, y = estimate)) + geom_point() + geom_errorbar(aes(ymin = conf.low, 
    ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0, linetype = "dotted") + 
    coord_flip() + ggtitle("Factors associated with withdrawal - J")

aug1 <- cbind(semJ, predict(m, newdata = semJ, type = "response", se = TRUE))

ggplot(aug1, aes(x = fit, fill = factor(withdraw))) + geom_histogram(alpha = 0.5) + 
    theme_bw()

HLgof.test(fit = aug1$fit[!is.na(aug1$fit)], obs = aug1$withdraw[!is.na(aug1$fit)])
## $C
## 
##  Hosmer-Lemeshow C statistic
## 
## data:  aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 51.488, df = 8, p-value = 2.113e-08
## 
## 
## $H
## 
##  Hosmer-Lemeshow H statistic
## 
## data:  aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 254.64, df = 8, p-value < 2.2e-16

12.3 Compare B and J: difference of estimates

tidyB <- tidyB %>% mutate(semester = "B")

tidyJ <- tidyJ %>% mutate(semester = "J")

# Differences of betas
tidy.compare.beta <- bind_rows(tidyB, tidyJ) %>% dplyr::select(term, estimate, 
    semester) %>% spread(key = semester, value = estimate) %>% mutate(diff.estimate = B - 
    J)

# Uncertainty of difference
tidy.compare.std <- bind_rows(tidyB, tidyJ) %>% dplyr::select(term, std.error, 
    semester) %>% spread(key = semester, value = std.error) %>% group_by(term) %>% 
    mutate(diff.std = sqrt(B^2 + J^2)) %>% dplyr::select(-B, -J)

# Combine and CI
tidy.compare <- tidy.compare.beta %>% left_join(tidy.compare.std) %>% mutate(diff.estimate = diff.estimate/diff.std) %>% 
    mutate(conf.low = diff.estimate - 1.96 * diff.std, conf.high = diff.estimate + 
        1.96 * diff.std)

ggplot(tidy.compare, aes(x = term, y = diff.estimate)) + geom_point() + geom_errorbar(aes(ymin = conf.low, 
    ymax = conf.high), width = 0.5) + theme_bw() + geom_hline(yintercept = 0, 
    linetype = "dotted") + coord_flip() + labs(y = "B - J") + ggtitle("Differences between semesters")

13 Linear model on score - not used

Now I will study what factors influence the total score obtained with assignments and exams.

It is slightly skewed.

hist(combine.feat.unique$comb.score)

13.1 Combined by semester

m <- lm(comb.score ~ gender + region + highest_education + studied_credits.std + 
    disability + code_module + semester + year + total.clicks.std + num_of_prev_attempts_dich + 
    final_result, data = combine.feat.unique)

car::vif(m)

par(mfrow = c(2, 2))
plot(m)
  • diagnostics
library(MASS)

qqnorm(combine.feat.unique$comb.score,
       ylab="Quantiles for comb.score")
qqline(combine.feat.unique$comb.score, 
       col="red")

Box = boxcox(combine.feat.unique$comb.score + 1 ~ 1,              # Transform Turbidity as a single vector
             lambda = seq(-6,6,0.1))

Cox = data.frame(Box$x, Box$y)            # Create a data frame with the results

Cox2 = Cox[with(Cox, order(-Cox$Box.y)),] # Order the new data frame by decreasing y

Cox2[1,]                                  # Display the lambda with the greatest
                                          #    log likelihood

lambda = Cox2[1, "Box.x"]                 # Extract that lambda

combine.feat.unique$comb.score.box = (combine.feat.unique$comb.score ^ lambda - 1)/lambda   # Transform the original data

combine.feat.unique$comb.score.box.st <- scale(combine.feat.unique$comb.score.box,center = TRUE,scale = TRUE)

qqnorm(combine.feat.unique$comb.score.box,
       ylab="Quantiles for comb.score")
qqline(combine.feat.unique$comb.score.box, 
       col="red")
m <- lm(comb.score.std ~ gender + region + highest_education + studied_credits.std + 
    disability + code_module + semester + year + total.clicks.std + num_of_prev_attempts_dich, 
    data = combine.feat.unique)

par(mfrow = c(2, 4))
plot(m)

# After box-cox
m <- lm(comb.score.box.st ~ gender + region + highest_education + studied_credits.std + 
    disability + code_module + semester + year + total.clicks.std + num_of_prev_attempts_dich, 
    data = combine.feat.unique)

plot(combine.feat.unique$comb.score.box.st, combine.feat.unique$comb.score)

plot(m)

car::vif(m)


tidym <- tidy(m, conf.int = TRUE)

tidym <- tidym %>% mutate(term = factor(term, levels = term)) %>% filter(term != 
    "(Intercept)")

ggplot(tidym, aes(x = term, y = estimate, colour = -log10(p.value))) + geom_point() + 
    geom_errorbar(aes(ymin = conf.low, ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0, 
    linetype = "dotted") + coord_flip() + ggtitle("Factors associated with total score")

13.2 Semester B

m <- lm(comb.score.std ~ gender + region + highest_education + studied_credits.std + 
    disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich, 
    data = combine.feat.unique[combine.feat.unique$semester %in% "B", ])

par(mfrow = c(2, 4))
plot(m)

# After box-cox
m <- lm(comb.score.box.st ~ gender + region + highest_education + studied_credits.std + 
    disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich, 
    data = combine.feat.unique[combine.feat.unique$semester %in% "B", ])

par(mfrow = c(2, 2))
plot(m)

car::vif(m)
  • From the plot of residuals and quite high correlation between the y variable and the residuals it looks like we are not explaining that much and indeed the Rsquared of the model is just 0.2.
aug <- augment(m)

pairs(aug[, c(".resid", ".fitted", "total.clicks.std", ".std.resid", "studied_credits.std", 
    "comb.score.box.st")])

ggplot(aug, aes(x = code_module, y = .resid, fill = region)) + geom_boxplot()
# After box-cox
m <- lm(comb.score.box.st ~ gender + highest_education + total.clicks.std + 
    num_of_prev_attempts_dich, data = combine.feat.unique[combine.feat.unique$semester %in% 
    "B", ])

summary(m)

par(mfrow = c(2, 2))
plot(m)

aug <- augment(m)

pairs(aug[, c(".resid", ".fitted", "total.clicks.std", ".std.resid", "comb.score.box.st")])

13.3 Semester J

m <- lm(comb.score.std ~ gender + region + highest_education + studied_credits.std + 
    disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich, 
    data = combine.feat.unique[combine.feat.unique$semester %in% "J", ])

summary(m)

par(mfrow = c(2, 2))
plot(m)

# After box-cox
m <- lm(comb.score.box.st ~ gender + highest_education + studied_credits.std + 
    disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich, 
    data = combine.feat.unique[combine.feat.unique$semester %in% "J", ])

par(mfrow = c(2, 2))
plot(m)

summary(m)

car::vif(m)
  • Same thing for this semester
aug <- augment(m)

pairs(aug[, c(".resid", ".fitted", "total.clicks.std", ".std.resid", "studied_credits.std", 
    "comb.score.box.st")])

14 Purl

knitr::purl("nous-student.Rmd", output = "nous-student.R", documentation = 2)